{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "turned-permit",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "import json\n",
    "from math import ceil\n",
    "\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.cluster import DBSCAN, KMeans\n",
    "from sklearn.base import clone\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from yellowbrick.cluster import KElbowVisualizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "under-throat",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install --target=/Users/franz/opt/anaconda3/envs/Data_visualization/lib/python3.8/site-packages/ yellowbrick"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "sustainable-chuck",
   "metadata": {},
   "source": [
    "## Data Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "prime-evening",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"final_poster.csv\")\n",
    "european_union = json.load(open(\"european-union-countries.geojson\", \"r\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "committed-absorption",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.rename(columns={\"Unnamed: 0\": \"Country\"}, inplace=True)\n",
    "df[\"Country\"].replace({\"Czechia\": \"Czech Republic\"}, inplace=True)\n",
    "df.set_index('Country', inplace = True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "plain-tourist",
   "metadata": {},
   "source": [
    "## Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "increased-julian",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "df_before = df.copy()\n",
    "\n",
    "cols = df.columns\n",
    "rows = df.index\n",
    "\n",
    "dic = {}\n",
    "i = 0\n",
    "\n",
    "for element in cols:\n",
    "    dic[i]=element\n",
    "    i = i + 1\n",
    "\n",
    "x = df.values #returns a numpy array\n",
    "min_max_scaler = preprocessing.MinMaxScaler()\n",
    "x_scaled = min_max_scaler.fit_transform(x)\n",
    "df_scaled = pd.DataFrame(x_scaled)\n",
    "df_scaled = df_scaled.rename(columns=dic)\n",
    "df_scaled\n",
    "df_scaled.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "prime-kidney",
   "metadata": {},
   "source": [
    "### Create Perspectives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "suitable-italy",
   "metadata": {},
   "outputs": [],
   "source": [
    "health = df_scaled[[\"Obesity\", \"Diabetes Prevalence\", \"Cardiovascular Death Rate\", \n",
    "         \"Life Expectancy\", \"Health Expenditure (% of GDP)\"]]\n",
    "food = df_scaled.iloc[:,:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "broke-favorite",
   "metadata": {},
   "source": [
    "### Performing PCA for Dimensionality Reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "going-tower",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PCA\n",
    "def explained_variance(dataframe):\n",
    "    \"Calculates the Eigenvalues for each Principal Component in a dataframe.\"\n",
    "    \n",
    "    pca = PCA()\n",
    "    pca_feat = pca.fit_transform(dataframe)\n",
    "    \n",
    "    return np.cumsum(pca.explained_variance_ratio_), pca.n_components_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "economic-departure",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_principal_components(dataframe, nPC):\n",
    "    \"\"\"Gets PC values for each observation in dataframe and creates new dataframe.\"\"\"\n",
    "    \n",
    "    pca = PCA(nPC)\n",
    "    pca_feat = pca.fit_transform(dataframe)\n",
    "    pca_names = [f\"PC{i}\" for i in range(pca.n_components_)]\n",
    "    pca_df = pd.DataFrame(pca_feat, index=dataframe.index, columns=pca_names)\n",
    "    return pca_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "applicable-animation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Food\n",
    "print(explained_variance(food)[0].round(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "formed-afternoon",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Health\n",
    "print(explained_variance(health)[0].round(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "concrete-nigeria",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total\n",
    "total = pd.concat((food, health), axis=1)\n",
    "print(explained_variance(total)[0].round(3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "violent-above",
   "metadata": {},
   "source": [
    "PCA nur für Food, bei Health kann man auch gleich alle Variablen benutzen weil cumulative explained variance nicht bei PC1 oder \" richtig reinkickt."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "latter-stability",
   "metadata": {},
   "outputs": [],
   "source": [
    "pca_food = get_principal_components(food, 8)\n",
    "pca_total = get_principal_components(food, 9)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "demonstrated-ozone",
   "metadata": {},
   "source": [
    "## Clustering K-Means\n",
    "### Food"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fossil-vacuum",
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans Elbow\n",
    "model = KMeans(pca_food)\n",
    "visualizer = KElbowVisualizer(model, k=(1,10))\n",
    "\n",
    "visualizer.fit(pca_food)    \n",
    "visualizer.show()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "furnished-circular",
   "metadata": {},
   "outputs": [],
   "source": [
    "number_clusters = 5\n",
    "kmclust = KMeans(\n",
    "    n_clusters=number_clusters, \n",
    "    init='k-means++',\n",
    "    n_init=10, \n",
    "    random_state=27,\n",
    "    tol= 1e-4)\n",
    "km_labels_food = kmclust.fit_predict(pca_food)\n",
    "km_labels_food"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "olive-limitation",
   "metadata": {},
   "source": [
    "### Health"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hungry-newton",
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans Elbow\n",
    "model = KMeans(health)\n",
    "visualizer = KElbowVisualizer(model, k=(1,10))\n",
    "\n",
    "visualizer.fit(health)    \n",
    "visualizer.show()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mounted-tension",
   "metadata": {},
   "outputs": [],
   "source": [
    "number_clusters = 3\n",
    "kmclust = KMeans(\n",
    "    n_clusters=number_clusters, \n",
    "    init='k-means++',\n",
    "    n_init=10, \n",
    "    random_state=27,\n",
    "    tol= 1e-4)\n",
    "km_labels_health = kmclust.fit_predict(health)\n",
    "km_labels_health"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "intimate-overhead",
   "metadata": {},
   "source": [
    "### Total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "special-ecuador",
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans Elbow\n",
    "model = KMeans(total)\n",
    "visualizer = KElbowVisualizer(model, k=(1,10))\n",
    "\n",
    "visualizer.fit(total)    \n",
    "visualizer.show()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nuclear-orbit",
   "metadata": {},
   "outputs": [],
   "source": [
    "number_clusters = 4\n",
    "kmclust = KMeans(\n",
    "    n_clusters=number_clusters, \n",
    "    init='k-means++',\n",
    "    n_init=10, \n",
    "    random_state=27,\n",
    "    tol= 1e-4)\n",
    "km_labels_total = kmclust.fit_predict(total)\n",
    "km_labels_total"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "nervous-entrepreneur",
   "metadata": {},
   "source": [
    "### Evaluating K-Means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "preliminary-profit",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ss(df):\n",
    "    ss = np.sum(df.var() * (df.count() - 1))\n",
    "    return ss  # return sum of sum of squares of each df variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "european-catholic",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans_food = pd.concat((pca_food, pd.Series(km_labels_food, name='labels')), axis=1)\n",
    "sst = get_ss(kmeans_food.drop(columns=[\"labels\"]))  # get total sum of squares\n",
    "ssw_labels = kmeans_food.groupby(by='labels').apply(get_ss)  # compute ssw for each cluster labels\n",
    "ssb = sst - np.sum(ssw_labels)\n",
    "r2 = ssb / sst\n",
    "print(\"R^2 kmeans:\", r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "coordinate-magnitude",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans_health = pd.concat((health, pd.Series(km_labels_health, name='labels')), axis=1)\n",
    "sst = get_ss(kmeans_health.drop(columns=[\"labels\"]))  # get total sum of squares\n",
    "ssw_labels = kmeans_health.groupby(by='labels').apply(get_ss)  # compute ssw for each cluster labels\n",
    "ssb = sst - np.sum(ssw_labels)\n",
    "r2 = ssb / sst\n",
    "print(\"R^2 kmeans:\", r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "finished-eclipse",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmeans_total = pd.concat((total, pd.Series(km_labels_total, name='labels')), axis=1)\n",
    "sst = get_ss(kmeans_total.drop(columns=[\"labels\"]))  # get total sum of squares\n",
    "ssw_labels = kmeans_total.groupby(by='labels').apply(get_ss)  # compute ssw for each cluster labels\n",
    "ssb = sst - np.sum(ssw_labels)\n",
    "r2 = ssb / sst\n",
    "print(\"R^2 kmeans:\", r2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "worth-programming",
   "metadata": {},
   "source": [
    "## Visualization of Clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "static-cruise",
   "metadata": {},
   "outputs": [],
   "source": [
    "clusters = pd.concat((df_scaled, pd.Series(km_labels_food, name='cluster_food'),\n",
    "                      pd.Series(km_labels_health, name='cluster_health'),\n",
    "                      pd.Series(km_labels_total, name='cluster_total'),\n",
    "                      pd.Series(rows, name='Country'),\n",
    "                      ), axis=1)\n",
    "clusters.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "quiet-silly",
   "metadata": {},
   "source": [
    "## Geoplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "personal-artwork",
   "metadata": {},
   "outputs": [],
   "source": [
    "state_id_map = {}\n",
    "for feature in european_union[\"features\"]:\n",
    "    feature[\"id\"] = feature[\"properties\"][\"gu_a3\"]\n",
    "    state_id_map[feature[\"properties\"][\"name_long\"]] = feature[\"id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dress-trailer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create new column with polygon info\n",
    "clusters[\"id\"] = clusters[\"Country\"].apply(lambda x: state_id_map[x])\n",
    "clusters.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "complicated-commercial",
   "metadata": {},
   "source": [
    "### Food"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "wanted-calculation",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.choropleth_mapbox(\n",
    "    clusters,\n",
    "    locations=\"id\",\n",
    "    geojson=european_union,\n",
    "    color=\"cluster_food\",\n",
    "    hover_name=\"Country\",\n",
    "    hover_data=[\"Life Expectancy\"],\n",
    "    title=\"clusters by food\",\n",
    "    mapbox_style=\"carto-positron\", # kann hier auch darkmode u.a. - mit api token noch mehr aber to much\n",
    "    center={\"lat\": 56.5, \"lon\": 11},\n",
    "    zoom=2.5,\n",
    "    opacity=0.5, # können das hier weniger transparent machen\n",
    "    height=600,\n",
    "    width=550\n",
    ")\n",
    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "stainless-variety",
   "metadata": {},
   "source": [
    "### Health"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "likely-plain",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.choropleth_mapbox(\n",
    "    clusters,\n",
    "    locations=\"id\",\n",
    "    geojson=european_union,\n",
    "    color=\"cluster_health\",\n",
    "    hover_name=\"Country\",\n",
    "    hover_data=[\"Life Expectancy\"],\n",
    "    title=\"Clusters by health\",\n",
    "    mapbox_style=\"carto-positron\", # kann hier auch darkmode u.a. - mit api token noch mehr aber to much\n",
    "    center={\"lat\": 56.5, \"lon\": 11},\n",
    "    zoom=2.5,\n",
    "    opacity=0.5, # können das hier weniger transparent machen\n",
    "    height=600,\n",
    "    width=550\n",
    ")\n",
    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "prerequisite-motivation",
   "metadata": {},
   "source": [
    "### Total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "precise-dependence",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.choropleth_mapbox(\n",
    "    clusters,\n",
    "    locations=\"id\",\n",
    "    geojson=european_union,\n",
    "    color=\"cluster_total\",\n",
    "    hover_name=\"Country\",\n",
    "    hover_data=[\"Life Expectancy\"],\n",
    "    title=\"Clusters by health\",\n",
    "    mapbox_style=\"carto-positron\", # kann hier auch darkmode u.a. - mit api token noch mehr aber to much\n",
    "    center={\"lat\": 56.5, \"lon\": 11},\n",
    "    zoom=2.5,\n",
    "    opacity=0.5, # können das hier weniger transparent machen\n",
    "    height=600,\n",
    "    width=550\n",
    ")\n",
    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cloudy-dictionary",
   "metadata": {},
   "source": [
    "## Cluster Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "animal-tackle",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "green-brain",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "potential-music",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "entertaining-nashville",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "coated-irish",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "every-willow",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "twelve-connectivity",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
